# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

# Load the local sample page into `content`.
# NOTE(review): nothing in the visible part of this script reads `content`;
# the read is kept unchanged in case other code depends on the name.
with open('example_02.html') as f:
    content = f.read()

# Download the Sina home page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP
# error response into an exception instead of parsing an error page.
resp = requests.get('http://www.sina.com.cn', timeout=10)
resp.raise_for_status()

soup = BeautifulSoup(resp.content, "html.parser")

# Every anchor inside the "news_top" list of the #syncad_0 container.
link_list = soup.select('#syncad_0  ul.list-a.news_top li a')
for item in link_list:
    print(item.text)
    print(item.get('href'))

# Position (0-based) of the headline whose article is extracted further down.
TARGET_LINK_INDEX = 5

# Fail with a readable message rather than a bare IndexError when the page
# layout changes and the selector returns fewer links than expected.
if len(link_list) <= TARGET_LINK_INDEX:
    raise SystemExit(
        'expected at least %d headline links, found %d'
        % (TARGET_LINK_INDEX + 1, len(link_list))
    )

tar_url = link_list[TARGET_LINK_INDEX].get('href')
# An <a> without an href yields None, which cannot be fetched later on.
if not tar_url:
    raise SystemExit('selected headline link has no href attribute')

from goose3 import Goose 
from goose3.text import StopWordsChinese

# User-Agent header sent by Goose when it downloads the article page.
browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'

# Goose configuration: the user agent above plus the Chinese stop-word class.
goose_config = {
    'browser_user_agent': browser_user_agent,
    'stopwords_class': StopWordsChinese,
}

g = Goose(goose_config)

# Fetch the page at tar_url and extract the article from it.
article = g.extract(url=tar_url)

# Print the title first, then the cleaned body text, each on its own line.
print(article.title, article.cleaned_text, sep='\n')